library("ggplot2")
library("dplyr")
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library("gender")
## PLEASE NOTE: The method provided by this package must be used cautiously
## and responsibly. Please be sure to see the guidelines and warnings about
## usage in the README or the package documentation.
library("geosphere")
library("ggmap")
## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
## Please cite ggmap if you use it! See citation("ggmap") for details.
library("lubridate")
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library("leaflet")
# Load the listings. The file read here already carries a `gender` column
# (see the str() output); the raw export is kept below for reference.
# airbnb <- read.csv("AB_NYC_2019.csv")
airbnb <- read.csv(file = "AB_NYC_2019_Gender.csv")
# Show the column names and types of the loaded data.
str(object = airbnb)
## 'data.frame': 48895 obs. of 18 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ id : int 2539 2595 3647 3831 5022 5099 5121 5178 5203 5238 ...
## $ name : chr "Clean & quiet apt home by the park" "Skylit Midtown Castle" "THE VILLAGE OF HARLEM....NEW YORK !" "Cozy Entire Floor of Brownstone" ...
## $ host_id : int 2787 2845 4632 4869 7192 7322 7356 8967 7490 7549 ...
## $ host_name : chr "John" "Jennifer" "Elisabeth" "LisaRoxanne" ...
## $ neighbourhood_group : chr "Brooklyn" "Manhattan" "Manhattan" "Brooklyn" ...
## $ neighbourhood : chr "Kensington" "Midtown" "Harlem" "Clinton Hill" ...
## $ latitude : num 40.6 40.8 40.8 40.7 40.8 ...
## $ longitude : num -74 -74 -73.9 -74 -73.9 ...
## $ room_type : chr "Private room" "Entire home/apt" "Private room" "Entire home/apt" ...
## $ price : int 149 225 150 89 80 200 60 79 79 150 ...
## $ minimum_nights : int 1 1 3 1 10 3 45 2 2 1 ...
## $ number_of_reviews : int 9 45 0 270 9 74 49 430 118 160 ...
## $ last_review : chr "2018-10-19" "2019-05-21" NA "2019-07-05" ...
## $ reviews_per_month : num 0.21 0.38 NA 4.64 0.1 0.59 0.4 3.47 0.99 1.33 ...
## $ calculated_host_listings_count: int 6 2 1 1 1 1 1 1 1 4 ...
## $ availability_365 : int 365 355 365 194 0 129 0 220 0 188 ...
## $ gender : chr "male" "female" "female" "unknown" ...
# Preview the first six listings.
head(x = airbnb, n = 6L)
## X id name host_id host_name
## 1 1 2539 Clean & quiet apt home by the park 2787 John
## 2 2 2595 Skylit Midtown Castle 2845 Jennifer
## 3 3 3647 THE VILLAGE OF HARLEM....NEW YORK ! 4632 Elisabeth
## 4 4 3831 Cozy Entire Floor of Brownstone 4869 LisaRoxanne
## 5 5 5022 Entire Apt: Spacious Studio/Loft by central park 7192 Laura
## 6 6 5099 Large Cozy 1 BR Apartment In Midtown East 7322 Chris
## neighbourhood_group neighbourhood latitude longitude room_type price
## 1 Brooklyn Kensington 40.64749 -73.97237 Private room 149
## 2 Manhattan Midtown 40.75362 -73.98377 Entire home/apt 225
## 3 Manhattan Harlem 40.80902 -73.94190 Private room 150
## 4 Brooklyn Clinton Hill 40.68514 -73.95976 Entire home/apt 89
## 5 Manhattan East Harlem 40.79851 -73.94399 Entire home/apt 80
## 6 Manhattan Murray Hill 40.74767 -73.97500 Entire home/apt 200
## minimum_nights number_of_reviews last_review reviews_per_month
## 1 1 9 2018-10-19 0.21
## 2 1 45 2019-05-21 0.38
## 3 3 0 <NA> NA
## 4 1 270 2019-07-05 4.64
## 5 10 9 2018-11-19 0.10
## 6 3 74 2019-06-22 0.59
## calculated_host_listings_count availability_365 gender
## 1 6 365 male
## 2 2 355 female
## 3 1 365 female
## 4 1 194 unknown
## 5 1 0 female
## 6 1 129 male
# Preview the last six listings.
tail(x = airbnb, n = 6L)
## X id name
## 48890 48890 36484363 QUIT PRIVATE HOUSE
## 48891 48891 36484665 Charming one bedroom - newly renovated rowhouse
## 48892 48892 36485057 Affordable room in Bushwick/East Williamsburg
## 48893 48893 36485431 Sunny Studio at Historical Neighborhood
## 48894 48894 36485609 43rd St. Time Square-cozy single bed
## 48895 48895 36487245 Trendy duplex in the very heart of Hell's Kitchen
## host_id host_name neighbourhood_group neighbourhood latitude
## 48890 107716952 Michael Queens Jamaica 40.69137
## 48891 8232441 Sabrina Brooklyn Bedford-Stuyvesant 40.67853
## 48892 6570630 Marisol Brooklyn Bushwick 40.70184
## 48893 23492952 Ilgar & Aysel Manhattan Harlem 40.81475
## 48894 30985759 Taz Manhattan Hell's Kitchen 40.75751
## 48895 68119814 Christophe Manhattan Hell's Kitchen 40.76404
## longitude room_type price minimum_nights number_of_reviews
## 48890 -73.80844 Private room 65 1 0
## 48891 -73.94995 Private room 70 2 0
## 48892 -73.93317 Private room 40 4 0
## 48893 -73.94867 Entire home/apt 115 10 0
## 48894 -73.99112 Shared room 55 1 0
## 48895 -73.98933 Private room 90 7 0
## last_review reviews_per_month calculated_host_listings_count
## 48890 <NA> NA 2
## 48891 <NA> NA 2
## 48892 <NA> NA 2
## 48893 <NA> NA 1
## 48894 <NA> NA 6
## 48895 <NA> NA 1
## availability_365 gender
## 48890 163 male
## 48891 9 female
## 48892 36 female
## 48893 27 unknown
## 48894 2 male
## 48895 23 male
# Per-column summary statistics (also reports NA counts for numeric columns).
summary(object = airbnb)
## X id name host_id
## Min. : 1 Min. : 2539 Length:48895 Min. : 2438
## 1st Qu.:12224 1st Qu.: 9471945 Class :character 1st Qu.: 7822033
## Median :24448 Median :19677284 Mode :character Median : 30793816
## Mean :24448 Mean :19017143 Mean : 67620011
## 3rd Qu.:36672 3rd Qu.:29152178 3rd Qu.:107434423
## Max. :48895 Max. :36487245 Max. :274321313
##
## host_name neighbourhood_group neighbourhood latitude
## Length:48895 Length:48895 Length:48895 Min. :40.50
## Class :character Class :character Class :character 1st Qu.:40.69
## Mode :character Mode :character Mode :character Median :40.72
## Mean :40.73
## 3rd Qu.:40.76
## Max. :40.91
##
## longitude room_type price minimum_nights
## Min. :-74.24 Length:48895 Min. : 0.0 Min. : 1.00
## 1st Qu.:-73.98 Class :character 1st Qu.: 69.0 1st Qu.: 1.00
## Median :-73.96 Mode :character Median : 106.0 Median : 3.00
## Mean :-73.95 Mean : 152.7 Mean : 7.03
## 3rd Qu.:-73.94 3rd Qu.: 175.0 3rd Qu.: 5.00
## Max. :-73.71 Max. :10000.0 Max. :1250.00
##
## number_of_reviews last_review reviews_per_month
## Min. : 0.00 Length:48895 Min. : 0.010
## 1st Qu.: 1.00 Class :character 1st Qu.: 0.190
## Median : 5.00 Mode :character Median : 0.720
## Mean : 23.27 Mean : 1.373
## 3rd Qu.: 24.00 3rd Qu.: 2.020
## Max. :629.00 Max. :58.500
## NA's :10052
## calculated_host_listings_count availability_365 gender
## Min. : 1.000 Min. : 0.0 Length:48895
## 1st Qu.: 1.000 1st Qu.: 0.0 Class :character
## Median : 1.000 Median : 45.0 Mode :character
## Mean : 7.144 Mean :112.8
## 3rd Qu.: 2.000 3rd Qu.:227.0
## Max. :327.000 Max. :365.0
##
# Create Gender Column (from Piazza)
# Label every listing with the host's gender as inferred from `host_name` by
# gender::gender(). Each distinct host name is looked up exactly once and the
# result is mapped back onto the rows. A host is labelled "male" or "female"
# only when the lookup returns exactly one row with that value; everything
# else (no match, missing or empty name) is labelled "unknown".
classify_host_gender <- function(host_name) {
  # Skip the lookup entirely for missing/empty names.
  if (is.na(host_name) || !nzchar(host_name)) {
    return("unknown")
  }
  hit <- gender(host_name)
  if (nrow(hit) == 1 && hit$gender[1] %in% c("male", "female")) {
    hit$gender[1]
  } else {
    "unknown"
  }
}
host_names <- as.character(airbnb$host_name)
distinct_hosts <- unique(host_names)
distinct_genders <- vapply(
  distinct_hosts,
  classify_host_gender,
  character(1),
  USE.NAMES = FALSE
)
# Map the per-name result back to one value per listing.
airbnb$gender <- distinct_genders[match(host_names, distinct_hosts)]
airbnb$gender
# Write New Data File w/ Gender
# row.names = FALSE stops write.csv() from emitting the row index as an extra
# unnamed first column, which read.csv() would otherwise import as a spurious
# "X" column (and as one more such column on every write/read round trip).
write.csv(airbnb, "AB_NYC_2019_Gender.csv", row.names = FALSE)
# Create Description Length Column
# Number of characters in each listing name.
airbnb$descLength <- nchar(airbnb$name)
# Create Keywords Column
# Tag each listing with the first keyword found in its name, checked in
# priority order: Cozy, Large, Cute, Beautiful, then "!". Matching is literal
# and only recognises the all-lowercase and Capitalised spellings. Listings
# without any keyword get the string label "NA" (a real label, not a missing
# value).
name_has <- function(...) {
  # TRUE where the listing name contains any of the given literal strings.
  hits <- lapply(c(...), grepl, x = airbnb$name, fixed = TRUE)
  Reduce(`|`, hits)
}
keyword <- rep("NA", nrow(airbnb))
# Assign from lowest to highest priority so that earlier keywords win.
keyword[name_has("!")] <- "Exclaimation"
keyword[name_has("beautiful", "Beautiful")] <- "Beautiful"
keyword[name_has("cute", "Cute")] <- "Cute"
keyword[name_has("large", "Large")] <- "Large"
keyword[name_has("cozy", "Cozy")] <- "Cozy"
airbnb$keyword <- keyword
# Assign Factors
# Convert the categorical text columns to factors in a single pass.
factor_columns <- c(
  "neighbourhood_group", "neighbourhood", "room_type", "gender", "keyword"
)
airbnb[factor_columns] <- lapply(airbnb[factor_columns], as.factor)
# Parse the review date text into a Date.
airbnb$last_review <- as.Date(airbnb$last_review)
# Confirm the new column types.
str(airbnb)
## 'data.frame': 48895 obs. of 20 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ id : int 2539 2595 3647 3831 5022 5099 5121 5178 5203 5238 ...
## $ name : chr "Clean & quiet apt home by the park" "Skylit Midtown Castle" "THE VILLAGE OF HARLEM....NEW YORK !" "Cozy Entire Floor of Brownstone" ...
## $ host_id : int 2787 2845 4632 4869 7192 7322 7356 8967 7490 7549 ...
## $ host_name : chr "John" "Jennifer" "Elisabeth" "LisaRoxanne" ...
## $ neighbourhood_group : Factor w/ 5 levels "Bronx","Brooklyn",..: 2 3 3 2 3 3 2 3 3 3 ...
## $ neighbourhood : Factor w/ 221 levels "Allerton","Arden Heights",..: 109 128 95 42 62 138 14 96 203 36 ...
## $ latitude : num 40.6 40.8 40.8 40.7 40.8 ...
## $ longitude : num -74 -74 -73.9 -74 -73.9 ...
## $ room_type : Factor w/ 3 levels "Entire home/apt",..: 2 1 2 1 1 1 2 2 2 1 ...
## $ price : int 149 225 150 89 80 200 60 79 79 150 ...
## $ minimum_nights : int 1 1 3 1 10 3 45 2 2 1 ...
## $ number_of_reviews : int 9 45 0 270 9 74 49 430 118 160 ...
## $ last_review : Date, format: "2018-10-19" "2019-05-21" ...
## $ reviews_per_month : num 0.21 0.38 NA 4.64 0.1 0.59 0.4 3.47 0.99 1.33 ...
## $ calculated_host_listings_count: int 6 2 1 1 1 1 1 1 1 4 ...
## $ availability_365 : int 365 355 365 194 0 129 0 220 0 188 ...
## $ gender : Factor w/ 3 levels "female","male",..: 2 1 1 3 1 2 2 3 1 2 ...
## $ descLength : int 34 21 35 31 48 41 15 32 34 34 ...
## $ keyword : Factor w/ 6 levels "Beautiful","Cozy",..: 6 6 4 2 6 2 4 5 2 2 ...
# Mean price per host gender, plotted from highest to lowest.
genderRates <- with(airbnb, tapply(price, gender, mean, na.rm = TRUE))
barplot(
  sort(genderRates, decreasing = TRUE),
  ylab = "Price ($)"
)
# Mean availability (days out of 365) per host gender, highest first.
genderAvail <- with(airbnb, tapply(availability_365, gender, mean, na.rm = TRUE))
barplot(
  sort(genderAvail, decreasing = TRUE),
  ylab = "Availability (days)"
)
It seems that, on average, rental prices are highest for hosts of "unknown" gender (because of how the gender function works, these are often groups of people, such as couples or families). They are closely followed by male hosts, while female hosts command the lowest rental prices. I initially assumed that families may charge more because they are perceived as safer. However, comparing this with availability by gender shows that these listings are also more available, which I have assumed to mean the listing is booked less often. This might indicate that family homes are simply larger and thus more expensive.
By Neighborhood Group
# Mean price per borough, plotted from highest to lowest.
neighbourhoodRates <- with(
  airbnb,
  tapply(price, neighbourhood_group, mean, na.rm = TRUE)
)
barplot(sort(neighbourhoodRates, decreasing = TRUE))
On average, the most expensive listings are in Manhattan, and the least expensive listings are in the Bronx.
By Neighborhood
# Most Expensive
# Bar chart of the six neighbourhoods with the highest mean listing price.
airbnb %>%
  group_by(neighbourhood) %>%
  summarise(averagePrice = mean(price, na.rm = TRUE)) %>%
  arrange(desc(averagePrice)) %>%
  head(n = 6) %>%
  ggplot(
    mapping = aes(
      x = reorder(neighbourhood, -averagePrice),
      y = averagePrice
    )
  ) +
  geom_col()
## `summarise()` ungrouping output (override with `.groups` argument)
# Least Expensive
# Bar chart of the six neighbourhoods with the lowest mean listing price
# (bars are still drawn from the highest to the lowest of those six).
airbnb %>%
  group_by(neighbourhood) %>%
  summarise(averagePrice = mean(price, na.rm = TRUE)) %>%
  arrange(averagePrice) %>%
  head(n = 6) %>%
  ggplot(
    mapping = aes(
      x = reorder(neighbourhood, -averagePrice),
      y = averagePrice
    )
  ) +
  geom_col()
## `summarise()` ungrouping output (override with `.groups` argument)
On average, the most expensive listings are in Fort Wadsworth, and the least expensive listings are in Bull's Head.
# Total Number of Reviews
# Smoothed trend of price against the total review count.
ggplot(airbnb, aes(x = number_of_reviews, y = price)) +
  geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# Mean price for each distinct review count, one bar per count.
reviews_num_Rates <- with(
  airbnb,
  tapply(price, number_of_reviews, mean, na.rm = TRUE)
)
barplot(reviews_num_Rates)
Surprisingly, it seems that, on average, more total reviews lead to lower Airbnb rental prices. This could be caused by bad reviews. However, the most expensive Airbnb listings also tend to be those with the most reviews. This leads me to believe that reviews are an indicator of the extremes (really good listings and really bad ones), while average listings are reviewed less.
# Reviews per Month
# Smoothed trend of price against reviews per month.
ggplot(airbnb, aes(x = reviews_per_month, y = price)) +
  geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 10052 rows containing non-finite values (stat_smooth).
# Mean price for each distinct reviews-per-month value; the y axis is capped
# at 1000.
reviews_rpm_Rates <- with(
  airbnb,
  tapply(price, reviews_per_month, mean, na.rm = TRUE)
)
barplot(reviews_rpm_Rates, ylim = c(0, 1000))
Reviews per month seem to follow a similar trend to total reviews at first, with more reviews per month actually correlating with lower rental prices. However, at around 30 reviews per month this trend reverses and actually leads to slightly higher rental prices.
# SEE QUESTION 4
# Register the Google Maps API key used by ggmap. The key is read from the
# GGMAP_GOOGLE_API_KEY environment variable (for example set in ~/.Renviron)
# instead of being hard-coded, so the secret never ends up in source control
# or in rendered output.
# NOTE(review): the key that was previously committed on this line should be
# revoked and replaced in the Google Cloud console.
google_api_key <- Sys.getenv("GGMAP_GOOGLE_API_KEY")
if (!nzchar(google_api_key)) {
  stop(
    "Set the GGMAP_GOOGLE_API_KEY environment variable to a Google Maps API key.",
    call. = FALSE
  )
}
register_google(key = google_api_key)
# Get Rental Info by Price
# Build the table used for mapping: one row per listing with its id,
# coordinates, name, price and availability, keeping only the 100 most
# expensive listings. Listings with equal prices are ordered by ascending id.
# This is a plain column projection, so transmute() is used rather than
# group_by(id) + summarise(), which created one group per listing.
airbnb_stays <- airbnb %>%
  as_tibble() %>%
  transmute(
    id,
    lat = as.numeric(latitude),
    long = as.numeric(longitude),
    name,
    rental_price = price,
    availability = availability_365
  ) %>%
  # Sort by Price (highest first)
  arrange(desc(rental_price), id) %>%
  # Select 100 Most Expensive Rentals
  head(n = 100)
# Get Map and Plot Rental Locations
# Fetch a black-and-white Google roadmap for New York City at zoom level 11.
nyc_map <- get_map(
  location = "New York City, New York",
  source = "google",
  maptype = "roadmap",
  zoom = 11,
  color = "bw"
)
## Source : https://maps.googleapis.com/maps/api/staticmap?center=New%20York%20City,%20New%20York&zoom=11&size=640x640&scale=2&maptype=roadmap&language=en-EN&key=xxx
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=New+York+City,+New+York&key=xxx
# Plot Map - ggmap
## Price
# Draw the selected listings on the base map, coloured from green (low price)
# to red (high price), with axis ticks, text and titles hidden.
ggmap(nyc_map) +
  geom_point(
    mapping = aes(x = long, y = lat, colour = rental_price),
    data = airbnb_stays,
    size = 1,
    alpha = 0.5
  ) +
  scale_colour_gradient(low = "green", high = "red") +
  labs(x = "", y = "") +
  theme(axis.ticks = element_blank(), axis.text = element_blank())
## Warning: Removed 2 rows containing missing values (geom_point).
## Availability
# Same map as for price, but coloured by availability: green (low) to red
# (high).
ggmap(nyc_map) +
  geom_point(
    mapping = aes(x = long, y = lat, colour = availability),
    data = airbnb_stays,
    size = 1,
    alpha = 0.5
  ) +
  scale_colour_gradient(low = "green", high = "red") +
  labs(x = "", y = "") +
  theme(axis.ticks = element_blank(), axis.text = element_blank())
## Warning: Removed 2 rows containing missing values (geom_point).
# Plot Map - leaflet
# Interactive map with one marker per selected listing; each popup shows the
# listing's name, id and price.
marker_popups <- with(
  airbnb_stays,
  paste("Name:", name, "<br>", "ID:", id, "<br>", "Price: $", rental_price)
)
leaflet(airbnb_stays) %>%
  addTiles() %>%
  addMarkers(popup = marker_popups) %>%
  setView(lng = -73.96, lat = 40.75, zoom = 10)
## Assuming "long" and "lat" are longitude and latitude, respectively
The map reveals what we would expect based on our previous analyses. The most expensive rentals are, as expected, primarily in Manhattan, with some in Brooklyn. One pattern I recognized is that the most expensive listings seem to be clustered around geographical landmarks, such as Central Park or the water/beaches. I also plotted the availability of these stays, and most of them are very available (300+ days); based on the analysis I performed on availability, this makes sense, as more expensive listings were generally more available.
NOTE: ggmap and leaflet maps are provided to demonstrate/practice both methods, but the data is the same.
# Smoothed trend of price against the length of the listing name.
ggplot(airbnb, aes(x = descLength, y = price)) +
  geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
It seems that the optimal description length for charging the highest rental prices is around 100 characters. Longer and shorter descriptions generally command lower rental prices. However, another explanation is that more experienced and serious hosts will choose a descriptive yet concise description or listing name, and this happens to fall around 100 characters.
# Mean price per keyword label, plotted from highest to lowest.
keywordRates <- tapply(airbnb$price, airbnb$keyword, mean, na.rm = TRUE)
# Enlarge the bottom margin to fit the rotated (las = 2) labels, then restore
# the previous graphics settings so later base plots are unaffected.
old_par <- par(mar = c(6, 4, 2, 2))
barplot(sort(keywordRates, decreasing = TRUE), las = 2)
par(old_par)
After looking through the data, I identified some common keywords: Beautiful, Large, Cute, Cozy, and the use of exclamation points. It seems that using the keyword "beautiful" and adding exclamation marks in the description of the listing can lead to a higher rental price. Other popular keywords include "cozy," "cute," and "large," but these keywords actually seem to command a lower rental price than listings without any of the keywords. It is possible that using these buzzwords could turn off some renters.
# numEntire <- nrow(airbnb[airbnb$room_type == "Entire home/apt", ])
# numPrivate <- nrow(airbnb[airbnb$room_type == "Private room", ])
# numShared <- nrow(airbnb[airbnb$room_type == "Shared room", ])
# Number of listings per room type.
ggplot(airbnb, aes(x = room_type)) +
  geom_bar()
Most listings on Airbnb are for entire homes/apts. This is closely followed by private rooms, and shared rooms are by far the least common in NYC.
# Number of listings per borough, stacked by room type. `fill` (not `colour`)
# is mapped so that each bar segment is shaded by room type; `colour` only
# tints the bar outlines and leaves the segments indistinguishable.
ggplot(data = airbnb, aes(x = neighbourhood_group, fill = room_type)) +
  geom_bar()
As we can see, most listings are located in Manhattan (#1) and Brooklyn (#2), with the other boroughs being significantly less common rental locations. Although the most common type of stay in Manhattan is entire homes/apts, private rooms become a much more common option in the other 4 boroughs. This could be due to the fact that in the other boroughs space is very limited and it is more difficult to offer an entire space. Also, based on the previous graph, we know that Manhattan is likely skewing the data slightly.
# Smoothed trend of price against days available per year.
ggplot(airbnb, aes(x = availability_365, y = price)) +
  geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# Mean price for each distinct availability value, one bar per value.
availabilityRates <- with(
  airbnb,
  tapply(price, availability_365, mean, na.rm = TRUE)
)
barplot(availabilityRates)
Assumption: Availability is 365 days and for simplicity will assume days that are not available are booked
As expected, listings that are available for a larger percentage of the year are able to charge higher rental prices because they offer renters more flexibility. However, there is a curious dip around 325/365 days a year. This could be because their lack of availability is centered around holidays or popular travel times, when the Airbnb owner uses the property themselves instead of renting it out. Another explanation could be that more expensive rentals are more available because their prices are high and fewer renters can afford them. However, low availability cannot be explained in the same manner, as the host may simply list the property for fewer days.